Studying Frequency on Corpus

This notebook checks which characters occur in a given corpus and studies their frequency distribution. It will serve as a template for studying the structure of a text with more traditional methods before using the nexa machinery.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from nltk.book import text7 as text
import nltk
import string


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

In [2]:
# `text` is the Wall Street Journal sample (text7) imported above.
# len() counts tokens, and punctuation marks are tokens of their own.
print('Number of words', len(text))


Number of words 100676

In [3]:
# Peek at the first ten tokens of the corpus.
text[:10]


Out[3]:
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the']

As can be seen from the output above, the text is simply a list of words. Let's count the number of words and then concatenate them so that we can count the actual letters.


In [4]:
# Token count of the corpus (this repeats the check done in an earlier cell).
# The character-level counts are built in the cells that follow.
print('Number of words', len(text))


Number of words 100676

In [5]:
# Glue the tokens into one string so the corpus can be processed character by
# character. Note that a space is inserted at every token boundary, including
# before punctuation tokens.
concatenated_text = ' '.join(text)

In [6]:
# Show the first 50 characters to check the joined string; joining on spaces
# is crude, since it also separates punctuation from the preceding word.
concatenated_text[:50]


Out[6]:
'Pierre Vinken , 61 years old , will join the board'

In [7]:
# Iterating over a string yields single characters, so this FreqDist counts how
# often each character occurs (the spaces added when joining the tokens included).
distribution = nltk.FreqDist(concatenated_text)

In [8]:
# Plot the character frequencies; no sample limit is passed.
# Note: the figure size is left at its default. A larger canvas (e.g. 32x24
# inches via the current figure) was tried here before and is not in use.
distribution.plot()

In [9]:
# Text summary of the character counts; the printed form is abbreviated ("...").
distribution.pprint()


FreqDist({' ': 100675, 'e': 46345, 't': 33337, 'a': 32082, 'i': 28722, 'o': 28612, 'n': 28250, 's': 26853, 'r': 26391, 'l': 15804, ...})

We see that there are a lot of strange characters. Next, we convert all the letters to lowercase and compute the distribution again.


In [11]:
# Lower-case the corpus one character at a time. The result is a list of
# single characters, not a string.
text_lowercase = list(map(str.lower, concatenated_text))

In [12]:
# Character counts after lower-casing, so upper- and lower-case forms of a
# letter are tallied together.
distribution_low = nltk.FreqDist(text_lowercase)

In [13]:
# Text summary of the lower-cased character counts (abbreviated output).
distribution_low.pprint()


FreqDist({' ': 100675, 'e': 46871, 't': 36465, 'a': 33159, 'i': 29858, 'o': 28980, 'n': 28884, 's': 28382, 'r': 27336, 'l': 16349, ...})

In [14]:
import seaborn as sns

# Apply seaborn's default theme explicitly. Older seaborn releases restyled
# matplotlib as a side effect of the import, but since seaborn 0.8 the import
# alone changes nothing, so the styling has to be requested with set().
sns.set()

# Plot the frequencies of the lower-cased characters with the seaborn theme.
distribution_low.plot()


/home/heberto/miniconda/envs/nexa/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [15]:
# List every distinct character that occurs in the lower-cased corpus.
distribution_low.keys()


Out[15]:
dict_keys(['l', '6', 'q', '7', 'a', 'p', 'u', '-', 'n', ':', '0', '@', 'f', 'x', '1', 's', '\\', 'm', '$', 'b', '4', 'g', 'v', 'o', 'i', 'y', ';', 'd', '8', 'j', 'c', '5', '#', 'w', 'r', ' ', "'", '?', '2', 'k', '/', '9', '!', 't', '%', '.', 'z', 'h', 'e', '*', ',', '&', '3', '`'])

Bigrams.

Now we will calculate the bigrams and their frequency for this text.


In [28]:
# nltk.bigrams returns a one-shot generator. Materialise it as a list so that
# `bigrams` can be iterated more than once and the cells that consume it can be
# re-run without silently producing an empty result.
bigrams = list(nltk.bigrams(text_lowercase))

In [29]:
# Count how often each (first, second) character pair occurs.
# NOTE(review): if `bigrams` is a generator it is exhausted by this call, so
# re-running this cell without rebuilding `bigrams` would give an empty
# distribution.
bigrams_freq = nltk.FreqDist(bigrams)

However, the most frequent pairs in this distribution are the ones that contain a whitespace.


In [30]:
# The ten most frequent character pairs together with their counts.
bigrams_freq.most_common(10)


Out[30]:
[(('e', ' '), 13786),
 (('s', ' '), 11883),
 ((' ', 't'), 11397),
 ((' ', 'a'), 8657),
 (('t', 'h'), 8649),
 (('i', 'n'), 8477),
 (('d', ' '), 7610),
 (('t', ' '), 7528),
 (('h', 'e'), 7323),
 (('n', ' '), 7270)]

In [37]:
# Plot only 20 pairs rather than the whole distribution.
bigrams_freq.plot(20)

We can remove them using a standard dictionary comprehension.


In [32]:
# Keep only the pairs in which neither character is a space.
# The result is a plain dict, not an nltk.FreqDist.
bigrams_dist_without_space = {
    pair: count
    for pair, count in bigrams_freq.items()
    if ' ' not in pair
}

In [33]:
# Display the filtered dict in full: one entry per distinct pair, so the
# output is long.
bigrams_dist_without_space


Out[33]:
{('%', '-'): 2,
 ('&', 'l'): 2,
 ('&', 'p'): 35,
 ('&', 't'): 5,
 ("'", "'"): 684,
 ("'", '3'): 1,
 ("'", '4'): 1,
 ("'", '5'): 1,
 ("'", '8'): 3,
 ("'", 'a'): 1,
 ("'", 'b'): 2,
 ("'", 'c'): 2,
 ("'", 'd'): 5,
 ("'", 'l'): 8,
 ("'", 'm'): 14,
 ("'", 'n'): 1,
 ("'", 'r'): 29,
 ("'", 's'): 866,
 ("'", 't'): 325,
 ("'", 'v'): 16,
 ('*', '-'): 3738,
 ('*', '?'): 45,
 ('*', 'e'): 44,
 ('*', 'i'): 122,
 ('*', 'n'): 1,
 ('*', 'p'): 7,
 ('*', 'r'): 41,
 ('*', 't'): 1608,
 ('*', 'u'): 744,
 (',', '0'): 157,
 (',', '1'): 5,
 (',', '2'): 10,
 (',', '3'): 8,
 (',', '4'): 7,
 (',', '5'): 16,
 (',', '6'): 6,
 (',', '7'): 3,
 (',', '8'): 4,
 (',', '9'): 2,
 ('-', '-'): 230,
 ('-', '1'): 2291,
 ('-', '2'): 870,
 ('-', '3'): 302,
 ('-', '4'): 106,
 ('-', '5'): 59,
 ('-', '6'): 44,
 ('-', '7'): 46,
 ('-', '8'): 39,
 ('-', '9'): 31,
 ('-', 'a'): 71,
 ('-', 'b'): 85,
 ('-', 'c'): 93,
 ('-', 'd'): 69,
 ('-', 'e'): 34,
 ('-', 'f'): 42,
 ('-', 'g'): 20,
 ('-', 'h'): 35,
 ('-', 'i'): 55,
 ('-', 'j'): 5,
 ('-', 'k'): 9,
 ('-', 'l'): 166,
 ('-', 'm'): 69,
 ('-', 'n'): 7,
 ('-', 'o'): 76,
 ('-', 'p'): 67,
 ('-', 'q'): 23,
 ('-', 'r'): 178,
 ('-', 's'): 111,
 ('-', 't'): 130,
 ('-', 'u'): 17,
 ('-', 'v'): 7,
 ('-', 'w'): 24,
 ('-', 'y'): 76,
 ('.', '-'): 8,
 ('.', '.'): 32,
 ('.', '0'): 26,
 ('.', '1'): 73,
 ('.', '2'): 76,
 ('.', '3'): 52,
 ('.', '4'): 56,
 ('.', '5'): 108,
 ('.', '6'): 67,
 ('.', '7'): 63,
 ('.', '8'): 67,
 ('.', '9'): 71,
 ('.', 'a'): 10,
 ('.', 'b'): 3,
 ('.', 'c'): 12,
 ('.', 'd'): 7,
 ('.', 'h'): 2,
 ('.', 'i'): 2,
 ('.', 'j'): 14,
 ('.', 'k'): 14,
 ('.', 'l'): 4,
 ('.', 'm'): 6,
 ('.', 'n'): 3,
 ('.', 'p'): 6,
 ('.', 'r'): 4,
 ('.', 's'): 235,
 ('.', 't'): 2,
 ('.', 'v'): 4,
 ('.', 'w'): 2,
 ('.', 'y'): 8,
 ('/', '1'): 5,
 ('/', '2'): 29,
 ('/', '3'): 10,
 ('/', '4'): 40,
 ('/', '5'): 1,
 ('/', '8'): 52,
 ('/', 'c'): 4,
 ('/', 'd'): 1,
 ('/', 'f'): 3,
 ('/', 'i'): 1,
 ('/', 'm'): 8,
 ('/', 'p'): 2,
 ('/', 'w'): 1,
 ('0', ','): 75,
 ('0', '-'): 61,
 ('0', '.'): 55,
 ('0', '0'): 500,
 ('0', '1'): 21,
 ('0', '2'): 9,
 ('0', '3'): 13,
 ('0', '4'): 9,
 ('0', '5'): 17,
 ('0', '6'): 8,
 ('0', '7'): 14,
 ('0', '8'): 7,
 ('0', '9'): 20,
 ('0', '\\'): 1,
 ('0', 's'): 25,
 ('0', 't'): 2,
 ('0', 'u'): 1,
 ('1', ','): 30,
 ('1', '-'): 7,
 ('1', '.'): 95,
 ('1', '0'): 187,
 ('1', '1'): 78,
 ('1', '2'): 102,
 ('1', '3'): 97,
 ('1', '4'): 80,
 ('1', '5'): 123,
 ('1', '6'): 69,
 ('1', '7'): 59,
 ('1', '8'): 64,
 ('1', '9'): 397,
 ('1', ':'): 1,
 ('1', '\\'): 52,
 ('1', 'b'): 2,
 ('1', 's'): 1,
 ('1', 't'): 1,
 ('2', '%'): 2,
 ('2', ','): 17,
 ('2', '-'): 12,
 ('2', '.'): 84,
 ('2', '0'): 122,
 ('2', '1'): 50,
 ('2', '2'): 41,
 ('2', '3'): 43,
 ('2', '4'): 39,
 ('2', '5'): 88,
 ('2', '6'): 33,
 ('2', '7'): 29,
 ('2', '8'): 31,
 ('2', '9'): 29,
 ('2', '\\'): 7,
 ('3', ','): 13,
 ('3', '-'): 6,
 ('3', '.'): 87,
 ('3', '0'): 132,
 ('3', '1'): 30,
 ('3', '2'): 34,
 ('3', '3'): 23,
 ('3', '4'): 21,
 ('3', '5'): 33,
 ('3', '6'): 22,
 ('3', '7'): 35,
 ('3', '8'): 25,
 ('3', '9'): 17,
 ('3', ':'): 1,
 ('3', '\\'): 45,
 ('4', ','): 9,
 ('4', '-'): 6,
 ('4', '.'): 53,
 ('4', '0'): 65,
 ('4', '1'): 24,
 ('4', '2'): 24,
 ('4', '3'): 25,
 ('4', '4'): 22,
 ('4', '5'): 52,
 ('4', '6'): 18,
 ('4', '7'): 26,
 ('4', '8'): 16,
 ('4', '9'): 22,
 ('4', '\\'): 1,
 ('5', ','): 47,
 ('5', '-'): 6,
 ('5', '.'): 42,
 ('5', '0'): 184,
 ('5', '1'): 21,
 ('5', '2'): 20,
 ('5', '3'): 24,
 ('5', '4'): 16,
 ('5', '5'): 34,
 ('5', '6'): 19,
 ('5', '7'): 24,
 ('5', '8'): 20,
 ('5', '9'): 18,
 ('5', '\\'): 13,
 ('6', ','): 12,
 ('6', '-'): 4,
 ('6', '.'): 61,
 ('6', '0'): 46,
 ('6', '1'): 18,
 ('6', '2'): 32,
 ('6', '3'): 23,
 ('6', '4'): 29,
 ('6', '5'): 22,
 ('6', '6'): 14,
 ('6', '7'): 13,
 ('6', '8'): 10,
 ('6', '9'): 13,
 ('6', '\\'): 2,
 ('7', ','): 5,
 ('7', '-'): 8,
 ('7', '.'): 80,
 ('7', '0'): 47,
 ('7', '1'): 13,
 ('7', '2'): 24,
 ('7', '3'): 17,
 ('7', '4'): 11,
 ('7', '5'): 47,
 ('7', '6'): 22,
 ('7', '7'): 23,
 ('7', '8'): 10,
 ('7', '9'): 21,
 ('7', '\\'): 15,
 ('8', ','): 8,
 ('8', '-'): 5,
 ('8', '.'): 64,
 ('8', '0'): 45,
 ('8', '1'): 11,
 ('8', '2'): 21,
 ('8', '3'): 17,
 ('8', '4'): 27,
 ('8', '5'): 39,
 ('8', '6'): 20,
 ('8', '7'): 51,
 ('8', '8'): 63,
 ('8', '9'): 56,
 ('9', ','): 2,
 ('9', '-'): 6,
 ('9', '.'): 38,
 ('9', '0'): 94,
 ('9', '1'): 32,
 ('9', '2'): 28,
 ('9', '3'): 18,
 ('9', '4'): 14,
 ('9', '5'): 28,
 ('9', '6'): 18,
 ('9', '7'): 42,
 ('9', '8'): 194,
 ('9', '9'): 118,
 ('9', '\\'): 1,
 ('9', 'y'): 1,
 (':', '1'): 1,
 (':', '3'): 1,
 ('?', '*'): 45,
 ('\\', '/'): 157,
 ('`', '`'): 702,
 ('a', '*'): 7,
 ('a', '-'): 35,
 ('a', '.'): 51,
 ('a', 'a'): 5,
 ('a', 'b'): 597,
 ('a', 'c'): 1301,
 ('a', 'd'): 1344,
 ('a', 'e'): 64,
 ('a', 'f'): 205,
 ('a', 'g'): 811,
 ('a', 'h'): 46,
 ('a', 'i'): 1579,
 ('a', 'j'): 57,
 ('a', 'k'): 427,
 ('a', 'l'): 3240,
 ('a', 'm'): 995,
 ('a', 'n'): 5987,
 ('a', 'o'): 11,
 ('a', 'p'): 725,
 ('a', 'q'): 7,
 ('a', 'r'): 4162,
 ('a', 's'): 2510,
 ('a', 't'): 4002,
 ('a', 'u'): 449,
 ('a', 'v'): 558,
 ('a', 'w'): 190,
 ('a', 'x'): 59,
 ('a', 'y'): 1041,
 ('a', 'z'): 57,
 ('b', '-'): 250,
 ('b', '.'): 12,
 ('b', 'a'): 658,
 ('b', 'b'): 29,
 ('b', 'c'): 12,
 ('b', 'd'): 2,
 ('b', 'e'): 1632,
 ('b', 'f'): 1,
 ('b', 'h'): 2,
 ('b', 'i'): 610,
 ('b', 'j'): 22,
 ('b', 'l'): 540,
 ('b', 'm'): 15,
 ('b', 'n'): 1,
 ('b', 'o'): 759,
 ('b', 'p'): 1,
 ('b', 'r'): 258,
 ('b', 's'): 96,
 ('b', 't'): 69,
 ('b', 'u'): 788,
 ('b', 'v'): 3,
 ('b', 'y'): 471,
 ('c', '$'): 2,
 ('c', "'"): 1,
 ('c', '-'): 6,
 ('c', '.'): 136,
 ('c', 'a'): 1647,
 ('c', 'b'): 30,
 ('c', 'c'): 203,
 ('c', 'd'): 7,
 ('c', 'e'): 1947,
 ('c', 'f'): 3,
 ('c', 'g'): 28,
 ('c', 'h'): 1967,
 ('c', 'i'): 864,
 ('c', 'k'): 695,
 ('c', 'l'): 533,
 ('c', 'm'): 18,
 ('c', 'n'): 4,
 ('c', 'o'): 3169,
 ('c', 'q'): 66,
 ('c', 'r'): 523,
 ('c', 's'): 82,
 ('c', 't'): 1292,
 ('c', 'u'): 625,
 ('c', 'y'): 94,
 ('c', 'z'): 6,
 ('d', "'"): 1,
 ('d', '-'): 92,
 ('d', '.'): 47,
 ('d', 'a'): 573,
 ('d', 'b'): 7,
 ('d', 'c'): 8,
 ('d', 'd'): 167,
 ('d', 'e'): 2697,
 ('d', 'f'): 10,
 ('d', 'g'): 102,
 ('d', 'h'): 4,
 ('d', 'i'): 1648,
 ('d', 'j'): 11,
 ('d', 'l'): 71,
 ('d', 'm'): 78,
 ('d', 'n'): 31,
 ('d', 'o'): 576,
 ('d', 'p'): 1,
 ('d', 'q'): 8,
 ('d', 'r'): 224,
 ('d', 's'): 401,
 ('d', 't'): 19,
 ('d', 'u'): 544,
 ('d', 'v'): 93,
 ('d', 'w'): 40,
 ('d', 'y'): 110,
 ('e', '-'): 192,
 ('e', '.'): 12,
 ('e', '\\'): 3,
 ('e', 'a'): 2301,
 ('e', 'b'): 123,
 ('e', 'c'): 1652,
 ('e', 'd'): 3737,
 ('e', 'e'): 1071,
 ('e', 'f'): 412,
 ('e', 'g'): 388,
 ('e', 'h'): 87,
 ('e', 'i'): 459,
 ('e', 'j'): 14,
 ('e', 'k'): 155,
 ('e', 'l'): 1587,
 ('e', 'm'): 935,
 ('e', 'n'): 3810,
 ('e', 'o'): 201,
 ('e', 'p'): 597,
 ('e', 'q'): 128,
 ('e', 'r'): 6358,
 ('e', 's'): 4673,
 ('e', 't'): 1498,
 ('e', 'u'): 76,
 ('e', 'v'): 620,
 ('e', 'w'): 597,
 ('e', 'x'): 925,
 ('e', 'y'): 465,
 ('e', 'z'): 9,
 ('f', '-'): 18,
 ('f', '.'): 25,
 ('f', 'a'): 465,
 ('f', 'b'): 1,
 ('f', 'c'): 1,
 ('f', 'e'): 711,
 ('f', 'f'): 539,
 ('f', 'i'): 1138,
 ('f', 'k'): 4,
 ('f', 'l'): 139,
 ('f', 'm'): 1,
 ('f', 'o'): 1477,
 ('f', 'r'): 579,
 ('f', 's'): 26,
 ('f', 't'): 235,
 ('f', 'u'): 423,
 ('f', 'y'): 23,
 ('f', 'z'): 1,
 ('g', '-'): 35,
 ('g', '.'): 10,
 ('g', 'a'): 503,
 ('g', 'd'): 11,
 ('g', 'e'): 1441,
 ('g', 'f'): 3,
 ('g', 'g'): 75,
 ('g', 'h'): 615,
 ('g', 'i'): 433,
 ('g', 'j'): 1,
 ('g', 'k'): 2,
 ('g', 'l'): 124,
 ('g', 'm'): 30,
 ('g', 'n'): 228,
 ('g', 'o'): 430,
 ('g', 'r'): 707,
 ('g', 's'): 218,
 ('g', 't'): 48,
 ('g', 'u'): 231,
 ('g', 'w'): 1,
 ('g', 'y'): 59,
 ('h', '*'): 122,
 ('h', '-'): 54,
 ('h', '.'): 15,
 ('h', 'a'): 3141,
 ('h', 'b'): 7,
 ('h', 'c'): 1,
 ('h', 'd'): 13,
 ('h', 'e'): 7323,
 ('h', 'f'): 3,
 ('h', 'h'): 6,
 ('h', 'i'): 1659,
 ('h', 'k'): 4,
 ('h', 'l'): 43,
 ('h', 'm'): 31,
 ('h', 'n'): 113,
 ('h', 'o'): 1184,
 ('h', 'q'): 3,
 ('h', 'r'): 181,
 ('h', 's'): 94,
 ('h', 't'): 284,
 ('h', 'u'): 156,
 ('h', 'w'): 7,
 ('h', 'y'): 55,
 ('i', '-'): 25,
 ('i', '.'): 3,
 ('i', 'a'): 1021,
 ('i', 'b'): 189,
 ('i', 'c'): 2229,
 ('i', 'd'): 1535,
 ('i', 'e'): 1117,
 ('i', 'f'): 439,
 ('i', 'g'): 788,
 ('i', 'h'): 11,
 ('i', 'i'): 31,
 ('i', 'j'): 7,
 ('i', 'k'): 122,
 ('i', 'l'): 2041,
 ('i', 'm'): 643,
 ('i', 'n'): 8477,
 ('i', 'o'): 2636,
 ('i', 'p'): 263,
 ('i', 'q'): 18,
 ('i', 'r'): 968,
 ('i', 's'): 2532,
 ('i', 't'): 3318,
 ('i', 'u'): 41,
 ('i', 'v'): 835,
 ('i', 'w'): 12,
 ('i', 'x'): 99,
 ('i', 'y'): 1,
 ('i', 'z'): 167,
 ('j', '.'): 27,
 ('j', 'a'): 235,
 ('j', 'e'): 87,
 ('j', 'i'): 29,
 ('j', 'o'): 236,
 ('j', 'r'): 13,
 ('j', 'u'): 215,
 ('k', '-'): 50,
 ('k', '.'): 18,
 ('k', 'a'): 79,
 ('k', 'b'): 7,
 ('k', 'd'): 9,
 ('k', 'e'): 1106,
 ('k', 'f'): 7,
 ('k', 'g'): 1,
 ('k', 'h'): 11,
 ('k', 'i'): 294,
 ('k', 'j'): 1,
 ('k', 'k'): 5,
 ('k', 'l'): 60,
 ('k', 'm'): 2,
 ('k', 'n'): 65,
 ('k', 'o'): 54,
 ('k', 'p'): 6,
 ('k', 'r'): 16,
 ('k', 's'): 296,
 ('k', 't'): 2,
 ('k', 'u'): 16,
 ('k', 'w'): 5,
 ('k', 'y'): 40,
 ('l', '-'): 65,
 ('l', '.'): 22,
 ('l', 'a'): 1627,
 ('l', 'b'): 12,
 ('l', 'c'): 60,
 ('l', 'd'): 949,
 ('l', 'e'): 2199,
 ('l', 'f'): 95,
 ('l', 'g'): 3,
 ('l', 'h'): 5,
 ('l', 'i'): 2257,
 ('l', 'k'): 65,
 ('l', 'l'): 2421,
 ('l', 'm'): 60,
 ('l', 'n'): 5,
 ('l', 'o'): 1067,
 ('l', 'p'): 102,
 ('l', 'r'): 149,
 ('l', 's'): 527,
 ('l', 't'): 354,
 ('l', 'u'): 355,
 ('l', 'v'): 71,
 ('l', 'w'): 24,
 ('l', 'y'): 1190,
 ('l', 'z'): 2,
 ('m', '-'): 23,
 ('m', '.'): 16,
 ('m', 'a'): 2045,
 ('m', 'b'): 289,
 ('m', 'c'): 56,
 ('m', 'd'): 1,
 ('m', 'e'): 2235,
 ('m', 'f'): 8,
 ('m', 'h'): 2,
 ('m', 'i'): 1380,
 ('m', 'k'): 1,
 ('m', 'l'): 1,
 ('m', 'm'): 357,
 ('m', 'n'): 17,
 ('m', 'o'): 1150,
 ('m', 'p'): 1038,
 ('m', 'r'): 436,
 ('m', 's'): 260,
 ('m', 't'): 2,
 ('m', 'u'): 211,
 ('m', 'w'): 2,
 ('m', 'y'): 57,
 ('n', "'"): 329,
 ('n', '-'): 78,
 ('n', '.'): 55,
 ('n', '\\'): 9,
 ('n', 'a'): 1184,
 ('n', 'b'): 24,
 ('n', 'c'): 1306,
 ('n', 'd'): 3353,
 ('n', 'e'): 2525,
 ('n', 'f'): 155,
 ('n', 'g'): 3507,
 ('n', 'h'): 25,
 ('n', 'i'): 972,
 ('n', 'j'): 12,
 ('n', 'k'): 328,
 ('n', 'l'): 153,
 ('n', 'm'): 123,
 ('n', 'n'): 234,
 ('n', 'o'): 926,
 ('n', 'p'): 10,
 ('n', 'q'): 7,
 ('n', 'r'): 66,
 ('n', 's'): 1558,
 ('n', 't'): 3391,
 ('n', 'u'): 296,
 ('n', 'v'): 356,
 ('n', 'w'): 40,
 ('n', 'x'): 9,
 ('n', 'y'): 572,
 ('n', 'z'): 11,
 ('o', "'"): 6,
 ('o', '-'): 56,
 ('o', '.'): 86,
 ('o', 'a'): 208,
 ('o', 'b'): 294,
 ('o', 'c'): 683,
 ('o', 'd'): 481,
 ('o', 'e'): 88,
 ('o', 'f'): 2846,
 ('o', 'g'): 293,
 ('o', 'h'): 94,
 ('o', 'i'): 270,
 ('o', 'j'): 14,
 ('o', 'k'): 217,
 ('o', 'l'): 1020,
 ('o', 'm'): 2096,
 ('o', 'n'): 5587,
 ('o', 'o'): 540,
 ('o', 'p'): 664,
 ('o', 'r'): 4187,
 ('o', 's'): 1019,
 ('o', 't'): 901,
 ('o', 'u'): 2234,
 ('o', 'v'): 662,
 ('o', 'w'): 904,
 ('o', 'x'): 21,
 ('o', 'y'): 84,
 ('o', 'z'): 17,
 ('p', '*'): 44,
 ('p', '-'): 12,
 ('p', '.'): 150,
 ('p', 'a'): 1525,
 ('p', 'b'): 32,
 ('p', 'c'): 9,
 ('p', 'e'): 1297,
 ('p', 'f'): 5,
 ('p', 'h'): 189,
 ('p', 'i'): 347,
 ('p', 'j'): 10,
 ('p', 'k'): 7,
 ('p', 'l'): 714,
 ('p', 'm'): 44,
 ('p', 'o'): 1085,
 ('p', 'p'): 431,
 ('p', 'r'): 1956,
 ('p', 's'): 123,
 ('p', 't'): 248,
 ('p', 'u'): 370,
 ('p', 'w'): 2,
 ('p', 'y'): 25,
 ('q', 'u'): 422,
 ('r', "'"): 1,
 ('r', '*'): 41,
 ('r', '-'): 120,
 ('r', '.'): 425,
 ('r', '\\'): 3,
 ('r', 'a'): 2564,
 ('r', 'b'): 308,
 ('r', 'c'): 471,
 ('r', 'd'): 670,
 ('r', 'e'): 5771,
 ('r', 'f'): 47,
 ('r', 'g'): 455,
 ('r', 'h'): 29,
 ('r', 'i'): 2146,
 ('r', 'j'): 1,
 ('r', 'k'): 609,
 ('r', 'l'): 353,
 ('r', 'm'): 531,
 ('r', 'n'): 675,
 ('r', 'o'): 2421,
 ('r', 'p'): 273,
 ('r', 'q'): 2,
 ('r', 'r'): 475,
 ('r', 's'): 1809,
 ('r', 't'): 1259,
 ('r', 'u'): 414,
 ('r', 'v'): 191,
 ('r', 'w'): 38,
 ('r', 'y'): 606,
 ('s', '$'): 4,
 ('s', '&'): 37,
 ('s', '-'): 27,
 ('s', '.'): 331,
 ('s', '\\'): 1,
 ('s', 'a'): 1486,
 ('s', 'b'): 40,
 ('s', 'c'): 410,
 ('s', 'd'): 91,
 ('s', 'e'): 2820,
 ('s', 'f'): 28,
 ('s', 'g'): 6,
 ('s', 'h'): 1085,
 ('s', 'i'): 1654,
 ('s', 'k'): 122,
 ('s', 'l'): 186,
 ('s', 'm'): 170,
 ('s', 'n'): 19,
 ('s', 'o'): 1084,
 ('s', 'p'): 559,
 ('s', 'q'): 9,
 ('s', 'r'): 20,
 ('s', 's'): 1193,
 ('s', 't'): 3840,
 ('s', 'u'): 1042,
 ('s', 'v'): 7,
 ('s', 'w'): 107,
 ('s', 'x'): 23,
 ('s', 'y'): 98,
 ('t', '&'): 5,
 ('t', '*'): 1609,
 ('t', '-'): 135,
 ('t', '.'): 66,
 ('t', 'a'): 1413,
 ('t', 'b'): 13,
 ('t', 'c'): 98,
 ('t', 'd'): 27,
 ('t', 'e'): 3645,
 ('t', 'f'): 18,
 ('t', 'g'): 24,
 ('t', 'h'): 8649,
 ('t', 'i'): 3676,
 ('t', 'j'): 5,
 ('t', 'l'): 287,
 ('t', 'm'): 201,
 ('t', 'n'): 45,
 ('t', 'o'): 3746,
 ('t', 'p'): 10,
 ('t', 'r'): 1686,
 ('t', 's'): 1619,
 ('t', 't'): 479,
 ('t', 'u'): 742,
 ('t', 'v'): 13,
 ('t', 'w'): 195,
 ('t', 'x'): 1,
 ('t', 'y'): 513,
 ('t', 'z'): 17,
 ('u', '*'): 744,
 ('u', '.'): 248,
 ('u', 'a'): 374,
 ('u', 'b'): 258,
 ('u', 'c'): 586,
 ('u', 'd'): 303,
 ('u', 'e'): 511,
 ('u', 'f'): 69,
 ('u', 'g'): 365,
 ('u', 'h'): 1,
 ('u', 'i'): 387,
 ('u', 'j'): 14,
 ('u', 'k'): 14,
 ('u', 'l'): 902,
 ('u', 'm'): 392,
 ('u', 'n'): 1276,
 ('u', 'o'): 23,
 ('u', 'p'): 453,
 ('u', 'q'): 2,
 ('u', 'r'): 1488,
 ('u', 's'): 1319,
 ('u', 't'): 1554,
 ('u', 'u'): 4,
 ('u', 'v'): 4,
 ('u', 'x'): 9,
 ('u', 'y'): 127,
 ('u', 'z'): 5,
 ('v', '.'): 36,
 ('v', 'a'): 310,
 ('v', 'e'): 2529,
 ('v', 'i'): 866,
 ('v', 'j'): 1,
 ('v', 'o'): 203,
 ('v', 'r'): 4,
 ('v', 's'): 2,
 ('v', 'v'): 1,
 ('v', 'y'): 13,
 ('w', '-'): 22,
 ('w', '.'): 10,
 ('w', 'a'): 893,
 ('w', 'b'): 1,
 ('w', 'c'): 4,
 ('w', 'd'): 10,
 ('w', 'e'): 903,
 ('w', 'f'): 3,
 ('w', 'g'): 2,
 ('w', 'h'): 824,
 ('w', 'i'): 1079,
 ('w', 'k'): 1,
 ('w', 'l'): 20,
 ('w', 'm'): 8,
 ('w', 'n'): 244,
 ('w', 'o'): 638,
 ('w', 'p'): 5,
 ('w', 'r'): 66,
 ('w', 's'): 129,
 ('w', 't'): 47,
 ('w', 'u'): 1,
 ('w', 'w'): 1,
 ('w', 'y'): 26,
 ('x', '-'): 23,
 ('x', 'a'): 73,
 ('x', 'c'): 126,
 ('x', 'e'): 178,
 ('x', 'h'): 8,
 ('x', 'i'): 36,
 ('x', 'n'): 4,
 ('x', 'o'): 29,
 ('x', 'p'): 313,
 ('x', 't'): 125,
 ('x', 'u'): 6,
 ('x', 'w'): 4,
 ('x', 'x'): 3,
 ('x', 'y'): 1,
 ('y', '-'): 70,
 ('y', '.'): 9,
 ('y', '\\'): 4,
 ('y', 'a'): 34,
 ('y', 'b'): 9,
 ('y', 'c'): 18,
 ('y', 'd'): 6,
 ('y', 'e'): 733,
 ('y', 'f'): 3,
 ('y', 'g'): 2,
 ('y', 'i'): 211,
 ('y', 'l'): 30,
 ('y', 'm'): 75,
 ('y', 'n'): 34,
 ('y', 'o'): 329,
 ('y', 'p'): 33,
 ('y', 'q'): 2,
 ('y', 'r'): 30,
 ('y', 's'): 473,
 ('y', 't'): 28,
 ('y', 'u'): 1,
 ('y', 'w'): 8,
 ('y', 'z'): 1,
 ('z', '.'): 3,
 ('z', 'a'): 56,
 ('z', 'd'): 2,
 ('z', 'e'): 143,
 ('z', 'h'): 3,
 ('z', 'i'): 57,
 ('z', 'l'): 4,
 ('z', 'n'): 1,
 ('z', 'o'): 11,
 ('z', 'u'): 4,
 ('z', 'y'): 7,
 ('z', 'z'): 6}

The problem with this approach is that we end up with a dictionary and not an nltk FreqDist object. Therefore we are deprived of handy methods like plot and most_common.

Another approach is to first drop the bigrams that contain a whitespace and then build the FreqDist from the remaining ones, so that the spaces are never counted in the first place.


In [34]:
# Rebuild the bigram sequence, then discard every pair that contains a space
# before any counting takes place.
bigrams = nltk.bigrams(text_lowercase)
bigrams_without_space = [
    pair
    for pair in bigrams
    if ' ' not in pair
]

In [36]:
# Build the distribution from the already-filtered pairs, so the result is a
# FreqDist (with plot and most_common available) that contains no spaces.
bigrams_frequency = nltk.FreqDist(bigrams_without_space)
# Plot only 10 pairs rather than the whole distribution.
bigrams_frequency.plot(10)

In [ ]: